{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  文本分类之 - 情感分析 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from matplotlib import pyplot as plt\n",
    "import jieba # 分词\n",
    "import re # 正则\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "import numpy as np\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_data(path, is_pos=None):\n",
    "    \"\"\"\n",
    "    给定文件的路径，读取文件\n",
    "    path: path to the data\n",
    "    is_pos: 是否数据是postive samples. \n",
    "    return: (list of review texts, list of labels) \n",
    "    \"\"\"\n",
    "    reviews, labels  = [], []\n",
    "    with open(path, 'r') as file:\n",
    "        review_start  = False\n",
    "        review_text = []\n",
    "        for line in file:\n",
    "            line = line.strip()\n",
    "            if not line: continue\n",
    "            if not review_start and line.startswith(\"<review\"):\n",
    "                review_start = True\n",
    "                if \"label\" in line:\n",
    "                    labels.append(int(line.split('\"')[-2]))\n",
    "                continue                \n",
    "            if review_start and line == \"</review>\":\n",
    "                review_start = False\n",
    "                reviews.append(\" \".join(review_text))\n",
    "                review_text = []\n",
    "                continue\n",
    "            if review_start:\n",
    "                review_text.append(line)\n",
    "    if is_pos:\n",
    "        labels = [1]*len(reviews)\n",
    "    elif not is_pos is None:\n",
    "        labels = [0]*len(reviews)\n",
    "    return reviews, labels\n",
    "\n",
    "\n",
    "def process_file():\n",
    "    \"\"\"\n",
    "    读取训练数据和测试数据，并对它们做一些预处理\n",
    "    \"\"\"    \n",
    "    train_pos_file = \"data_sentiment/train.positive.txt\"\n",
    "    train_neg_file = \"data_sentiment/train.negative.txt\"\n",
    "    test_comb_file = \"data_sentiment/test.combined.txt\"\n",
    "    \n",
    "    # 读取文件部分，把具体的内容写入到变量里面\n",
    "    train_pos_cmts, train_pos_lbs = read_data(train_pos_file, True)\n",
    "    train_neg_cmts, train_neg_lbs = read_data(train_neg_file, False)\n",
    "    train_comments = train_pos_cmts + train_neg_cmts\n",
    "    train_labels = train_pos_lbs + train_neg_lbs\n",
    "    test_comments, test_labels = read_data(test_comb_file)\n",
    "    return train_comments, train_labels, test_comments, test_labels\n",
    "train_comments, train_labels, test_comments, test_labels = process_file()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8064 2500\n",
      "手感超好，而且黑色相比白色在转得时候不容易眼花，找童年的记忆啦。 1\n"
     ]
    }
   ],
   "source": [
    "# 训练数据和测试数据大小\n",
    "print (len(train_comments), len(test_comments))\n",
    "\n",
    "print (train_comments[1], train_labels[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_stopwords(path):\n",
    "    \"\"\"\n",
    "    从外部文件中导入停用词\n",
    "    \"\"\"\n",
    "    stopwords = set()\n",
    "    with open(path, 'r') as in_file:\n",
    "        for line in in_file:\n",
    "            stopwords.add(line.strip())\n",
    "    return stopwords\n",
    "\n",
    "\n",
    "def clean_non_chinese_symbols(text):\n",
    "    \"\"\"\n",
    "    处理非中文字符\n",
    "    \"\"\"\n",
    "    text = re.sub('[!！]+', \"!\", text)\n",
    "    text = re.sub('[?？]+', \"?\", text)\n",
    "    text = re.sub(\"[a-zA-Z#$%&\\'()*+,-./:;：<=>@，。★、…【】《》“”‘’[\\\\]^_`{|}~]+\", \" UNK \", text)\n",
    "    return re.sub(\"\\s+\", \" \", text)  \n",
    "\n",
    "def clean_numbers(text):\n",
    "    \"\"\"\n",
    "    处理数字符号  128  190  NUM \n",
    "    \"\"\"\n",
    "    return re.sub(\"\\d+\", ' NUM ', text)\n",
    "\n",
    "def preprocess_text(text, stopwords):\n",
    "    \"\"\"\n",
    "    文本的预处理过程\n",
    "    \"\"\"\n",
    "    text = clean_non_chinese_symbols(text)\n",
    "    text = clean_numbers(text)\n",
    "    text = \" \".join([term for term in jieba.cut(text) if term and not term in stopwords])\n",
    "    return text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "path_stopwords = \"./data_sentiment/stopwords.txt\"\n",
    "stopwords = load_stopwords(path_stopwords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /var/folders/pr/0fhkrt7s4cj8yygh6m87_1fw0000gn/T/jieba.cache\n",
      "Loading model cost 0.988 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "发短信 特别 不 方便 ! 背后 屏幕 很大 起来 不 舒服   UNK   手触 屏 ! 切换 屏幕 很 麻烦 ! 终于 找到 同道中人 初中   UNK   已经 喜欢 上   UNK   同学 都 鄙夷 眼光 看   UNK   人为   UNK   样子 古怪 说 ＂ 丑 ＂ 当场 气晕 现在 同道中人   UNK   好开心 !   UNK   !   UNK  \n"
     ]
    }
   ],
   "source": [
    "# 对于train_comments, test_comments进行字符串的处理，几个考虑的点：\n",
    "#   1. 停用词过滤\n",
    "#   2. 去掉特殊符号\n",
    "#   3. 去掉数字（比如价格..)\n",
    "#   4. ...\n",
    "#   需要注意的点是，由于评论数据本身很短，如果去掉的太多，很可能字符串长度变成0\n",
    "#   预处理部部分，可以自行选择合适的方案，只要注释就可以。\n",
    "\n",
    "train_comments_new = [preprocess_text(comment, stopwords) for comment in train_comments]\n",
    "test_comments_new = [preprocess_text(comment, stopwords) for comment in test_comments]\n",
    "\n",
    "print (train_comments_new[0], test_comments_new[0])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(8064, 23101) (2500, 23101) (8064,) (2500,)\n"
     ]
    }
   ],
   "source": [
    "#   利用tf-idf从文本中提取特征,写到数组里面. \n",
    "#   参考：https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html\n",
    "tfidf = TfidfVectorizer()\n",
    "X_train =  tfidf.fit_transform(train_comments_new) # 训练数据的特征\n",
    "y_train =  train_labels # 训练数据的label\n",
    "X_test = tfidf.transform(test_comments_new) # 测试数据的特征\n",
    "y_test = test_labels# 测试数据的label\n",
    "\n",
    "print (np.shape(X_train), np.shape(X_test), np.shape(y_train), np.shape(y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy on test data:  0.6368\n"
     ]
    }
   ],
   "source": [
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "clf = MultinomialNB()\n",
    "# 利用朴素贝叶斯做训练\n",
    "clf.fit(X_train, y_train)\n",
    "y_pred = clf.predict(X_test)\n",
    "print(\"accuracy on test data: \", accuracy_score(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy on test data:  0.524\n"
     ]
    }
   ],
   "source": [
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "clf = KNeighborsClassifier(n_neighbors=1)\n",
    "clf.fit(X_train, y_train)\n",
    "y_pred = clf.predict(X_test)\n",
    "print(\"accuracy on test data: \", accuracy_score(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.neighbors import KNeighborsRegressor\n",
    "\n",
    "normalizer = StandardScaler()  # data is no longer sparse\n",
    "X_train_normalized = normalizer.fit_transform(X_train.toarray())\n",
    "X_test_normalized = normalizer.transform(X_test.toarray())\n",
    "\n",
    "knn = KNeighborsRegressor(n_neighbors=3)\n",
    "knn.fit(X_train_normalized, y_train)\n",
    "\n",
    "#Now we can predict prices:\n",
    "y_pred = knn.predict(X_test_normalized)\n",
    "print(\"accuracy on test data: \", accuracy_score(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy on test data:  0.7136\n"
     ]
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "clf = LogisticRegression(solver='liblinear')\n",
    "clf.fit(X_train, y_train)\n",
    "y_pred = clf.predict(X_test)\n",
    "print(\"accuracy on test data: \", accuracy_score(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
